---
title: "Manuscript Figures and Tables"
subtitle: "Publication-Ready Outputs for BAM and ICA Papers"
author: "Jessala A. Grijalva"
date: "`r format(Sys.Date(), '%B %d, %Y')`"
format:
  pdf:
    toc: true
    toc-depth: 2
    number-sections: true
    geometry: margin=1in
    keep-tex: false
    fig-pos: "H"
    include-in-header:
      text: |
        \usepackage{booktabs}
        \usepackage{caption}
        \captionsetup[table]{labelfont=bf, labelsep=period, justification=raggedright, singlelinecheck=false}
        \captionsetup[figure]{labelfont=bf, labelsep=period, justification=raggedright, singlelinecheck=false}
execute:
  echo: false
  warning: false
  message: false
  error: false
---

```{r setup}
#| label: setup

library(here)
library(pacman)
p_load(dplyr, tidyr, ggplot2, tibble, forcats, knitr, kableExtra, mclust)

knitr::opts_chunk$set(fig.align = "center")

# Load Phase 2 outputs
load(here("data", "processed", "bam_clustered_gmm_k4.rda"))

# Load pre-computed validation results
val_path <- here("data", "processed", "phase2_validation_results.rda")
if (file.exists(val_path)) {
  load(val_path)
  vr <- validation_results
  cat("Loaded pre-computed validation results.\n")
} else {
  stop("phase2_validation_results.rda not found. Re-render Phase_2_Analysis.qmd first.")
}

VARS5 <- vr$VARS5
df <- bam_clustered %>% filter(!is.na(orientation))

# ── Publishing standards ──
# Grayscale palette (consistent across all figures)
GRAY_PALETTE <- c(
  "Culture Affirming" = "gray15",
  "Assimilationist"   = "gray40",
  "Demicultural"      = "gray65",
  "Bicultural"        = "gray85"
)

SHAPE_PALETTE <- c(
  "Culture Affirming" = 16,   # filled circle
  "Assimilationist"   = 17,   # filled triangle
  "Demicultural"      = 15,   # filled square
  "Bicultural"        = 18    # filled diamond
)

LINETYPE_PALETTE <- c(
  "Culture Affirming" = "solid",
  "Assimilationist"   = "dashed",
  "Demicultural"      = "dotted",
  "Bicultural"        = "dotdash"
)

# Human-readable variable labels
VAR_LABELS <- c(
  "AMERICAN"          = "American Identity",
  "CULTURAL_IDENTITY" = "Cultural Identity",
  "KEEPSPAN"          = "Maintain Spanish",
  "DISTINCT"          = "Cultural Distinction",
  "LEARNENG"          = "Learn English"
)

# Pull key point estimates once
sil_point <- vr$comp_table %>%
  filter(method == "GMM (EEV)", k == 4) %>%
  pull(silhouette)

ch_point <- vr$comp_table %>%
  filter(method == "GMM (EEV)", k == 4) %>%
  pull(ch)

# Check for bootstrap metric CIs (added in Phase 2 update)
has_bt_metrics <- exists("metrics_summary", where = vr)

# Base theme for all figures
theme_pub <- function(base_size = 11) {
  theme_minimal(base_size = base_size) %+replace%
    theme(
      text = element_text(color = "black"),
      panel.grid.minor = element_blank(),
      legend.position = "bottom",
      legend.text = element_text(size = 9),
      plot.title = element_blank(),
      plot.subtitle = element_blank()
    )
}

# Ensure figures/ directory exists
if (!dir.exists(here("figures"))) dir.create(here("figures"), recursive = TRUE)
```

\newpage

# Overview

This document produces the publication-ready figures and tables referenced
in both manuscripts. Each item is labeled by its target paper and
table/figure number. High-resolution standalone PNGs (300 DPI) are
exported to `figures/` for journal production requests.

**Target manuscripts:**

1. *Myth of the Zero-Sum* (BAM) — Table 5, Table 6, Figure 3
2. *Making Cluster Analysis Inferential* (ICA) — Table 1, Table 2, Figure 3

\newpage

# ICA Paper

## Table 1: Cluster analysis performance comparison

```{r ica-table1}
#| label: tbl-ica-table1

comp <- vr$comp_table %>%
  filter(status == "ok") %>%
  select(method, k, silhouette, ch, sizes) %>%
  arrange(method, k) %>%
  mutate(
    silhouette = sprintf("%.2f", silhouette),
    ch         = sprintf("%.2f", ch)
  )

kable(comp,
      col.names = c("Algorithm", "k", "Silhouette", "CH Index", "Cluster Sizes"),
      caption = "Cluster analysis performance comparison across algorithms and k values.",
      align = c("l", "r", "r", "r", "l"),
      booktabs = TRUE) %>%
  kable_styling(latex_options = c("hold_position", "scale_down"),
                full_width = FALSE, font_size = 9) %>%
  row_spec(which(comp$method == "GMM (EEV)" &
                   as.numeric(gsub("[^0-9]", "", comp$k)) == 4),
           bold = TRUE)
```

\noindent \textit{Note:} Silhouette measures cohesion (higher = tighter clusters). CH index measures between- to within-cluster dispersion. Only solutions that converged are shown. \textit{N} = 4,785.

\newpage

## Table 2: Metrics for the optimal k = 4 GMM clustering solution

```{r ica-table2}
#| label: tbl-ica-table2

metrics <- tibble(
  Metric = c(
    "Observations",
    "Clusters (G)",
    "Covariance model",
    "BIC",
    "Log-likelihood",
    "Silhouette",
    "CH index",
    "Bootstrap ARI (B = 1,000)",
    "Bootstrap ARI 95% CI",
    "CV silhouette (5-fold mean)",
    "CV silhouette (5-fold SD)"
  ),
  Value = c(
    format(nrow(df), big.mark = ","),
    "4",
    "EEV",
    sprintf("%.2f", vr$gmm_k4_bic),
    sprintf("%.2f", vr$gmm_k4_loglik),
    sprintf("%.2f", sil_point),
    sprintf("%.2f", ch_point),
    sprintf("%.2f", vr$stab_summary$mean_ARI),
    sprintf("[%.2f, %.2f]", vr$stab_summary$ci_lo, vr$stab_summary$ci_hi),
    sprintf("%.2f", vr$cv_summary$mean_sil_test),
    sprintf("%.2f", vr$cv_summary$sd_sil_test)
  )
)

# Add bootstrap silhouette/Dunn CIs if available
if (has_bt_metrics) {
  ms <- vr$metrics_summary
  extra <- tibble(
    Metric = c(
      "Bootstrap silhouette 95% CI (B = 400)",
      "Bootstrap Dunn index 95% CI (B = 400)"
    ),
    Value = c(
      sprintf("[%.2f, %.2f]", ms$sil_lo[1], ms$sil_hi[1]),
      sprintf("[%.2f, %.2f]", ms$dunn_lo[1], ms$dunn_hi[1])
    )
  )
  idx <- which(metrics$Metric == "CH index")
  metrics <- bind_rows(
    metrics[1:idx, ],
    extra,
    metrics[(idx + 1):nrow(metrics), ]
  )
}

kable(metrics,
      col.names = c("Metric", "Value"),
      caption = "Metrics for the optimal k = 4 GMM clustering solution.",
      align = c("l", "r"),
      booktabs = TRUE) %>%
  kable_styling(latex_options = c("hold_position"), full_width = FALSE)
```

\noindent \textit{Note:} Metrics evaluate k = 4 GMM clustering of standardized acculturation variables. Bootstrap resampling drew 80\% subsamples without replacement. Cross-validation used predict() on held-out folds. Source: 2006 Latino National Survey (\textit{N} = 4,785).

\newpage

## Figure 3: Radar chart of cluster means (k = 4, GMM)

This figure appears as **ICA Figure 3** and **BAM Figure 3** (shared).

```{r ica-fig3}
#| label: fig-radar
#| fig-cap: "Acculturation orientation profiles (z-score means, k = 4 GMM). Dashed horizontal line at zero indicates the sample mean. Culture Affirming (near-black, solid); Assimilationist (dark gray, dashed); Demicultural (medium gray, dotted); Bicultural (light gray, dot-dash)."
#| fig-width: 7
#| fig-height: 4.5

z_long <- vr$z_means %>%
  mutate(variable = recode(variable, !!!VAR_LABELS))

ggplot(z_long, aes(x = variable, y = z_mean, group = orientation,
                    color = orientation, linetype = orientation,
                    shape = orientation)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray60") +
  geom_line(linewidth = 0.9) +
  geom_point(size = 3) +
  scale_color_manual(values = GRAY_PALETTE) +
  scale_shape_manual(values = SHAPE_PALETTE) +
  scale_linetype_manual(values = LINETYPE_PALETTE) +
  labs(x = NULL, y = "Standardized Mean (z-score)",
       color = "Orientation", linetype = "Orientation",
       shape = "Orientation") +
  theme_pub() +
  theme(
    axis.text.x = element_text(angle = 25, hjust = 1, size = 10),
    panel.grid.major.x = element_line(color = "gray90")
  )
```

```{r save-radar}
#| label: save-radar

ggsave(
  here("figures", "fig_radar_profiles.png"),
  width = 7, height = 4.5, units = "in", dpi = 300, bg = "white"
)
ggsave(
  here("figures", "fig_radar_profiles_600dpi.png"),
  width = 7, height = 4.5, units = "in", dpi = 600, bg = "white"
)
```

\newpage

# BAM Paper

## Table 5: Metrics for the optimal k = 4 GMM clustering solution

```{r bam-table5}
#| label: tbl-bam-table5

bam_metrics <- tibble(
  Metric = c(
    "Observations",
    "Clusters (G)",
    "Covariance model",
    "Silhouette",
    "CH index",
    "Bootstrap ARI (B = 1,000)",
    "Bootstrap ARI 95% CI",
    "CV silhouette (5-fold mean)",
    "Cluster sizes"
  ),
  Value = c(
    format(nrow(df), big.mark = ","),
    "4",
    "EEV",
    sprintf("%.2f", sil_point),
    sprintf("%.2f", ch_point),
    sprintf("%.2f", vr$stab_summary$mean_ARI),
    sprintf("[%.2f, %.2f]", vr$stab_summary$ci_lo, vr$stab_summary$ci_hi),
    sprintf("%.2f", vr$cv_summary$mean_sil_test),
    paste(sort(table(df$orientation)), collapse = ", ")
  )
)

if (has_bt_metrics) {
  ms <- vr$metrics_summary
  extra <- tibble(
    Metric = c(
      "Bootstrap silhouette 95% CI",
      "Bootstrap Dunn index 95% CI"
    ),
    Value = c(
      sprintf("[%.2f, %.2f]", ms$sil_lo[1], ms$sil_hi[1]),
      sprintf("[%.2f, %.2f]", ms$dunn_lo[1], ms$dunn_hi[1])
    )
  )
  idx <- which(bam_metrics$Metric == "CH index")
  bam_metrics <- bind_rows(
    bam_metrics[1:idx, ],
    extra,
    bam_metrics[(idx + 1):nrow(bam_metrics), ]
  )
}

kable(bam_metrics,
      col.names = c("Metric", "Value"),
      caption = "Metrics for the optimal k = 4 GMM clustering solution.",
      align = c("l", "r"),
      booktabs = TRUE) %>%
  kable_styling(latex_options = c("hold_position"), full_width = FALSE)
```

\noindent \textit{Note:} Metrics evaluate k = 4 GMM clustering of standardized acculturation variables. While scores indicate only moderate cohesion, convergent performance across three algorithms and bootstrapped CIs support solution stability. Source: 2006 Latino National Survey (\textit{N} = 4,785).

\newpage

## Table 6: Mean values of acculturation variables by cluster

```{r bam-table6}
#| label: tbl-bam-table6

means_tbl <- vr$table_means %>%
  select(orientation, n, all_of(VARS5)) %>%
  mutate(across(all_of(VARS5), ~ sprintf("%.2f", .x)))

kable(means_tbl,
      col.names = c("Orientation", "n",
                     "Amer. ID", "Cult. ID", "Keep Span.",
                     "Distinct.", "Learn Eng."),
      caption = "Mean values of acculturation variables by cluster.",
      align = c("l", "r", rep("r", 5)),
      booktabs = TRUE) %>%
  kable_styling(latex_options = c("hold_position"), full_width = FALSE)
```

\noindent \textit{Note:} Amer. ID = American Identity; Cult. ID = Cultural Identity; Keep Span. = Maintain Spanish; Distinct. = Cultural Distinction; Learn Eng. = Learn English. All variables measured on 4-point scales except Cultural Distinction (1--3). Source: 2006 Latino National Survey (\textit{N} = 4,785).

\newpage

## Table 6b: Cluster means with bootstrap 95% CIs

```{r bam-table6b}
#| label: tbl-bam-table6b

ci_tbl <- vr$bt_ci %>%
  mutate(
    formatted = sprintf("%.2f [%.2f, %.2f]", mean, lo, hi),
    variable = recode(variable, !!!VAR_LABELS)
  ) %>%
  select(orientation, variable, formatted) %>%
  pivot_wider(names_from = variable, values_from = formatted)

kable(ci_tbl,
      col.names = c("Orientation", VAR_LABELS[VARS5]),
      caption = "Cluster means with bootstrap 95 percent CIs (B = 400).",
      align = c("l", rep("c", 5)),
      booktabs = TRUE) %>%
  kable_styling(latex_options = c("hold_position", "scale_down"),
                full_width = FALSE, font_size = 9)
```

\noindent \textit{Note:} Values shown as mean [2.5th, 97.5th percentile]. Bootstrap resampled with replacement B = 400 times, computing conditional cluster means using full-sample labels. Source: 2006 Latino National Survey (\textit{N} = 4,785).

\newpage

# Additional Figures for Both Papers

## Heatmap of z-score means

```{r heatmap}
#| label: fig-heatmap
#| fig-cap: "Heatmap of cluster profiles on acculturation variables (z-score means). Darker shading indicates higher values; lighter shading indicates lower values."
#| fig-width: 6
#| fig-height: 4

z_long_heat <- vr$z_means %>%
  mutate(variable = recode(variable, !!!VAR_LABELS))

ggplot(z_long_heat, aes(x = variable, y = orientation, fill = z_mean)) +
  geom_tile(color = "white", linewidth = 0.8) +
  geom_text(aes(label = sprintf("%.2f", z_mean)),
            size = 3.5, fontface = "bold",
            color = ifelse(z_long_heat$z_mean > 0.5 | z_long_heat$z_mean < -0.5,
                           "white", "black")) +
  scale_fill_gradient2(
    low = "gray15", mid = "gray85", high = "gray15",
    midpoint = 0, name = "z-mean",
    guide = guide_colorbar(barwidth = 10)
  ) +
  labs(x = NULL, y = NULL) +
  theme_pub() +
  theme(
    axis.text.x = element_text(angle = 25, hjust = 1, size = 10),
    axis.text.y = element_text(size = 10),
    legend.position = "bottom"
  )
```

```{r save-heatmap}
#| label: save-heatmap

ggsave(
  here("figures", "fig_heatmap_profiles.png"),
  width = 6, height = 4, units = "in", dpi = 300, bg = "white"
)
ggsave(
  here("figures", "fig_heatmap_profiles_600dpi.png"),
  width = 6, height = 4, units = "in", dpi = 600, bg = "white"
)
```

\newpage

## PCA scatter plot

```{r pca-scatter}
#| label: fig-pca
#| fig-cap: "PCA projection of acculturation space by GMM k = 4 orientation. Culture Affirming (filled circle); Assimilationist (filled triangle); Demicultural (filled square); Bicultural (filled diamond)."
#| fig-width: 6
#| fig-height: 4.5

X_raw <- df %>%
  select(all_of(VARS5)) %>%
  mutate(across(everything(), as.numeric))

X <- scale(as.matrix(X_raw))
pca <- prcomp(X, center = FALSE, scale. = FALSE)

pca_df <- data.frame(
  PC1 = pca$x[, 1],
  PC2 = pca$x[, 2],
  Orientation = df$orientation
)

var_explained <- summary(pca)$importance[2, 1:2] * 100

ggplot(pca_df, aes(PC1, PC2, color = Orientation, shape = Orientation)) +
  geom_point(alpha = 0.3, size = 1) +
  scale_color_manual(values = GRAY_PALETTE) +
  scale_shape_manual(values = SHAPE_PALETTE) +
  labs(
    x = sprintf("PC1 (%.1f%%)", var_explained[1]),
    y = sprintf("PC2 (%.1f%%)", var_explained[2]),
    color = "Orientation",
    shape = "Orientation"
  ) +
  theme_pub()
```

```{r save-pca}
#| label: save-pca

ggsave(
  here("figures", "fig_pca_scatter.png"),
  width = 6, height = 4.5, units = "in", dpi = 300, bg = "white"
)
ggsave(
  here("figures", "fig_pca_scatter_600dpi.png"),
  width = 6, height = 4.5, units = "in", dpi = 600, bg = "white"
)
```

\newpage

## Bootstrap ARI distribution

```{r bootstrap-hist}
#| label: fig-bootstrap
#| fig-cap: "Distribution of bootstrap ARI values (B = 1,000 subsample refits). Vertical dashed line marks the mean ARI."
#| fig-width: 5
#| fig-height: 3

stab_ok <- vr$stab %>% filter(ok)

ggplot(stab_ok, aes(x = ari)) +
  geom_histogram(bins = 40, fill = "gray40", color = "white") +
  geom_vline(xintercept = vr$stab_summary$mean_ARI,
             linetype = "dashed", color = "black", linewidth = 0.8) +
  annotate("text",
           x = vr$stab_summary$mean_ARI + 0.02,
           y = Inf, vjust = 1.5, hjust = 0,
           label = sprintf("Mean ARI = %.2f", vr$stab_summary$mean_ARI),
           size = 3.5) +
  labs(x = "ARI vs Full-Sample GMM (k = 4)", y = "Count") +
  theme_pub()
```

```{r save-bootstrap}
#| label: save-bootstrap

ggsave(
  here("figures", "fig_bootstrap_ari.png"),
  width = 5, height = 3, units = "in", dpi = 300, bg = "white"
)
```

\newpage

## ARI agreement across algorithms (ICA)

```{r ari-bar}
#| label: fig-ari
#| fig-cap: "Agreement of k = 4 solutions with GMM reference (Adjusted Rand Index). Higher values indicate greater convergence."
#| fig-width: 5
#| fig-height: 3

ggplot(vr$ari_table, aes(x = reorder(method, ari_vs_gmm), y = ari_vs_gmm)) +
  geom_col(fill = "gray40", width = 0.6) +
  geom_text(aes(label = sprintf("%.2f", ari_vs_gmm)), hjust = -0.1, size = 3.5) +
  coord_flip() +
  scale_y_continuous(limits = c(0, 1.05), expand = c(0, 0)) +
  labs(x = NULL, y = "Adjusted Rand Index vs GMM (k = 4)") +
  theme_pub()
```

```{r save-ari}
#| label: save-ari

ggsave(
  here("figures", "fig_ari_agreement.png"),
  width = 5, height = 3, units = "in", dpi = 300, bg = "white"
)
```

\newpage

# Session Info

```{r session-info}
#| label: session-info

sessionInfo()
```
